knitr::opts_chunk$set(message = FALSE, error = FALSE)
library(tidyverse)
library(rvest)
library(magrittr)
library(lubridate)
Daily confirmed cases, deaths and recoveries broken down by country/region and state/province, when state/province is available.
confirmed_ts <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")) %>%
gather(date, confirmed, -`Province/State`, -`Country/Region`, -Lat, -Long) %>%
mutate(date = mdy(date))
deaths_ts <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")) %>%
gather(date, deaths, -`Province/State`, -`Country/Region`, -Lat, -Long) %>%
mutate(date = mdy(date))
recovered_ts <- read_csv(url("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv")) %>%
gather(date, recovered, -`Province/State`, -`Country/Region`, -Lat, -Long) %>%
mutate(date = mdy(date))
joined <- confirmed_ts %>%
left_join(deaths_ts) %>%
left_join(recovered_ts) %>%
gather(stat, value, confirmed, deaths, recovered) %>%
arrange(`Country/Region`, `Province/State`, date, stat)
joined %>% mutate_if(is.character, as.factor) %>% summary()
Province/State Country/Region Lat
Diamond Princess: 330 US :40755 Min. :-41.45
Grand Princess : 330 China : 5445 1st Qu.: 27.61
Adams, IN : 165 Canada : 1815 Median : 37.81
Alabama : 165 Australia : 1485 Mean : 31.81
Alachua, FL : 165 France : 1155 3rd Qu.: 42.33
(Other) :50985 United Kingdom: 660 Max. : 71.71
NA's :24255 (Other) :25080
Long date stat value
Min. :-157.86 Min. :2020-01-22 confirmed:25465 Min. : 0.0
1st Qu.: -93.06 1st Qu.:2020-02-04 deaths :25465 1st Qu.: 0.0
Median : -74.30 Median :2020-02-18 recovered:25465 Median : 0.0
Mean : -36.13 Mean :2020-02-18 Mean : 66.8
3rd Qu.: 20.94 3rd Qu.:2020-03-03 3rd Qu.: 0.0
Max. : 174.89 Max. :2020-03-16 Max. :67798.0
#' Prepares data.frame for plotting by aggregating, weighting, reordering,
#' and filtering by location.
#'
#' @param geo_level column name to aggregate (sum) by
#' @param dcr_wts weights to order locations; vector of c(deaths, confirmed, recovered)
#' @param show_top_n show top n locations ordered by dcr_wts
#'
#' @return an aggregated, reordered, and filtered by location data.frame
#'
#' @examples
#' plot_prep("Country/Region", c(100, 1, -1), show_top_n = 25)
#'
#' @export
plot_prep <- function(df, geo_level = "Country/Region", dcr_wts = c(1, 0.01, 0),
show_top = 1:10) {
df %>%
mutate(location = !!sym(geo_level)) %>%
# aggregate by location
group_by(location, date, stat) %>%
summarise(value = sum(value)) %>%
ungroup() %>%
# location order
group_by(location) %>%
mutate(
total_wt =
value[which(date == max(date) & stat == "deaths")] * dcr_wts[1] +
value[which(date == max(date) & stat == "confirmed")] * dcr_wts[2] +
value[which(date == max(date) & stat == "recovered")] * dcr_wts[3]
) %>%
ungroup() %>%
mutate(total_rank = dense_rank(desc(total_wt))) %>%
# filter top n
filter(total_rank %in% show_top) %>%
# order by rank
mutate(location = fct_reorder(location, total_rank))
}
#' Convenience function to plot country comparisons
plot_country_comps <- function(df) {
df %>%
ggplot(aes(date, value, color=location)) +
geom_line() + facet_wrap(vars(stat), ncol=1, scales = "free_y") +
theme(legend.position = "right", legend.title = element_blank()) +
xlab(element_blank())
}
joined %>%
plot_prep("Country/Region") %>%
plot_country_comps()
joined %>%
filter(!`Country/Region` %in% c("China")) %>%
plot_prep("Country/Region") %>%
plot_country_comps()
joined %>%
filter(!`Country/Region` %in% c("China", "Italy", "Iran", "Korea, South")) %>%
plot_prep("Country/Region") %>%
plot_country_comps()
joined %>%
filter(!`Country/Region` %in% c("China", "Italy", "Iran", "Korea, South",
"Spain", "France")) %>%
plot_prep("Country/Region") %>%
plot_country_comps()
#' Convenience function to plot growth comparisons
plot_growth_comps <- function(df) {
df %>%
ggplot(aes(date, value)) + geom_line() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
facet_wrap(
vars(location, stat), scales="free_y", ncol = 3,
labeller=label_wrap_gen(multi_line=FALSE)) +
theme(strip.background = element_blank(), legend.position = "None") +
xlab(element_blank())
}
joined %>%
plot_prep() %>%
plot_growth_comps()
joined %>%
filter(`Country/Region` == "China") %>%
plot_prep("Province/State") %>%
plot_growth_comps()
joined %>%
filter(`Country/Region` == "US") %>%
plot_prep("Province/State") %>%
plot_growth_comps()
joined %>%
filter(`Country/Region` == "Canada") %>%
plot_prep("Province/State") %>%
plot_growth_comps()
joined %>%
filter(`Country/Region` == "Australia") %>%
plot_prep("Province/State") %>%
plot_growth_comps()
plot_deaths_since_first <- function(df, days_since) {
df %>%
group_by(`Country/Region`) %>%
mutate(total_deaths = value[which(date == max(date) & stat == "deaths")][1]) %>%
filter(total_deaths > 0) %>%
plot_prep("Country/Region") %>%
arrange(location, stat, date) %>%
group_by(location) %>%
mutate(first_death_date = date[which(value != 0 & stat == "deaths")][1]) %>%
mutate(`Days since first death` = date - first_death_date) %>%
filter(stat == "deaths" & value > 0 &
`Days since first death` <= days_since) %>%
ggplot(aes(`Days since first death`, value, color=location)) + geom_line() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
theme(strip.background = element_blank(), legend.position = "right",
legend.title = element_blank()) +
ylab("Total deaths")
}
joined %>%
plot_deaths_since_first(10)
joined %>%
plot_deaths_since_first(30)
joined %>%
plot_deaths_since_first(90)
joined %>%
plot_prep(dcr_wts = c(100, 0.1, 0), show_top = 1:10) %>%
group_by(location, stat) %>%
arrange(date) %>%
mutate(
`Days to double` = if_else(
value < 10, as.difftime("", units="days"),
date - date[sapply(value, function(x) { which.min(abs(x / 2 - value)) })])
) %>%
ggplot(aes(date, `Days to double`, color=stat)) +
geom_point(alpha = 0.5) +
geom_smooth() +
#facet_wrap(vars(location), ncol = 3) +
facet_grid(vars(location), vars(stat), scales="free") +
theme(legend.position = "bottom", legend.title=element_blank()) +
xlab(element_blank())
joined %>%
filter(`Country/Region` != "China") %>%
plot_prep(dcr_wts = c(100, 0.1, 0), show_top = 1:10) %>%
group_by(location, stat) %>%
arrange(date) %>%
mutate(
`Days to double` = if_else(
value < 10, as.difftime("", units="days"),
date - date[sapply(value, function(x) { which.min(abs(x / 2 - value)) })])
) %>%
ggplot(aes(date, `Days to double`, color=stat)) +
geom_point(alpha = 0.5) +
geom_smooth() +
#facet_wrap(vars(location), ncol = 3) +
facet_grid(vars(location), vars(stat), scales="free") +
theme(legend.position = "bottom", legend.title=element_blank()) +
xlab(element_blank())
h/t: @loeserjohn
n_deaths <- 10
min_deaths <- n_deaths * 2
top_n_countries <- 1:20
trunc_to_n <- 2
geo_level = "Country/Region"
# TODO(roboton): Refactor into prep and plot functions
joined %>%
# plot prep at country level
plot_prep(geo_level, show_top = top_n_countries) %>%
# limit to deaths
filter(stat == "deaths") %>%
# drop countries with less than min_deaths
group_by(location) %>%
filter(value[which.max(date)] >= min_deaths) %>%
ungroup() %>%
# get the first date with more than or equal to n_deaths
group_by(location) %>%
mutate(nth_death_date = date[which(value >= n_deaths) - 1][1]) %>%
mutate(`Days since nth death` = date - nth_death_date) %>%
# Drop days before nth deaths date
filter(`Days since nth death` > 0) %>%
# Compute Days to Double
group_by(location) %>%
mutate(
# set days to double
double_idx = sapply(value, FUN=function(x) { max(which(value <= x/2)) }),
`Days to double` = date - date[double_idx]
) %>%
filter(!is.na(`Days to double`)) %>%
## plot
# truncate to second longest time series
group_by(location) %>%
mutate(max_days_since_nth_death = max(`Days since nth death`)) %>%
ungroup() %>%
mutate(max_days_rank = dense_rank(desc(max_days_since_nth_death))) %>%
filter(`Days since nth death` <= max(`Days since nth death`[max_days_rank == trunc_to_n])) %>%
group_by(location) %>%
mutate(time_points = n()) %>%
ungroup() %>%
filter(time_points > 2) %>%
ggplot(aes(`Days since nth death`, `Days to double`, color=location)) +
geom_point(alpha = 0.2) +
geom_smooth(method="auto", se=F) +
xlab(paste("days since deaths =", n_deaths)) -> p
ggplotly(p)
n_deaths <- 10
min_deaths <- n_deaths * 2
top_n_countries <- 1:15
trunc_to_n <- 1
geo_level = "Province/State"
# TODO(roboton): Refactor into prep and plot functions
joined %>%
filter(!is.na(`Province/State`) & `Province/State` != `Country/Region`) %>%
# plot prep at country level
plot_prep(geo_level, show_top = top_n_countries) %>%
# limit to deaths
filter(stat == "deaths") %>%
# drop countries with less than min_deaths
group_by(location) %>%
filter(value[which.max(date)] >= min_deaths) %>%
ungroup() %>%
# get the first date with more than or equal to n_deaths
group_by(location) %>%
mutate(nth_death_date = date[which(value >= n_deaths) - 1][1]) %>%
mutate(`Days since nth death` = date - nth_death_date) %>%
# Drop days before nth deaths date
filter(`Days since nth death` > 0) %>%
# Compute Days to Double
group_by(location) %>%
mutate(
# set days to double
double_idx = sapply(value, FUN=function(x) { max(which(value <= x/2)) }),
`Days to double` = date - date[double_idx]
) %>%
filter(!is.na(`Days to double`)) %>%
## plot
# truncate to second longest time series
group_by(location) %>%
mutate(max_days_since_nth_death = max(`Days since nth death`)) %>%
ungroup() %>%
mutate(max_days_rank = dense_rank(desc(max_days_since_nth_death))) %>%
filter(`Days since nth death` <= max(`Days since nth death`[max_days_rank == trunc_to_n])) %>%
group_by(location) %>%
mutate(time_points = n()) %>%
ungroup() %>%
filter(time_points > 2) %>%
ggplot(aes(`Days since nth death`, `Days to double`, color=location)) +
geom_point(alpha = 0.2) +
geom_smooth(method="auto", se=F) +
xlab(paste("days since deaths =", n_deaths)) -> p
ggplotly(p)